Packages

# importing the data
library(readxl) # import
library(tidyverse) # import

# visualisation
library(ggplot2) # ggplot()
library(ggpubr) # ggarrange()
library(ggfortify)
library(kableExtra)

# library(psych) #principal()

# library(esquisse)

theme_set(theme_bw()) # set ggplot to b/w
# options(scipen = 100) # set to 0 to go back to scientific notation
# rm(list = ls())

base_dir <- "~/Documents/repos/dnajc6-larval-behaviour1"

Introduction

Here I will be preparing the raw data obtained from behaviour tests of mutant dnajc6 families at 5 days old for CF to train a neural network with. This data has been summarised for 1 minute time bins. I will be investigating the time bins from 2-5 minutes (time bins 3, 4, and 5, as fish were not always correctly tracked in the first 2 minutes). This will represent their “normal” behaviour in light conditions. Additionally, we will be investigating their behaviour following a sudden light to dark transition (from 5-8 minutes, time bins 6, 7, and 8). I will be looking at both locomotor and anxiety-related features (thigmotaxis).

Importing data

Here I’m importing the behaviour data from .csv files exported from EthoVision XT. This data is being merged with a “metadata” spreadsheet that contains a fish ID, parents and spawn date, genotypes, tray number and position. An additional spreadsheet containing the trial information (trial number, time, and tray in that trial) was joined to the metadata as each tray was subjected to multiple trials. The fish metadata was joined to the behavioural data by trial number and position (in the tray).

Fish ID contains: (individual fish number).(parent pair number)-(day.month)

# getting the file with functions to read in data
source("scripts/import.R")

# here I'm selecting the trials of interest
raw_11 <- read_data("11-Oct-22") %>% subset(.$fish_id %in% grep("6.10", .$fish_id, value = T))

# here I'm selecting only the fish at 5 dpf
raw_12 <- read_data("12-Oct-22") %>% subset(.$fish_id %in% grep("7.10", .$fish_id, value = T))

Untracked

I need to remove fish that were not tracked by the machine.

# fish_ids of larvae that were untracked, or had severe deformities that prevented movement
untracked <- c(
  "2.2-6.10",
  "56.2-6.10"
)

raw_11[raw_11$fish_id %in% untracked,] %>%
  select(fish_id, Genotype, Position, Trial, Time) %>%
  distinct() %>%
  kable(caption = "Larvae omitted due to deformity or tracking problems") %>%
  kable_styling()
Larvae omitted due to deformity or tracking problems
fish_id Genotype Position Trial Time
2.2-6.10 het A2 1 1899-12-31 07:00:00
2.2-6.10 het A2 4 1899-12-31 08:12:00
2.2-6.10 het A2 7 1899-12-31 09:03:00
2.2-6.10 het A2 10 1899-12-31 10:41:00
56.2-6.10 het B2 3 1899-12-31 07:53:00
56.2-6.10 het B2 6 1899-12-31 08:46:00
56.2-6.10 het B2 9 1899-12-31 10:25:00
56.2-6.10 het B2 12 1899-12-31 11:23:00
raw_11 <- raw_11[!raw_11$fish_id %in% untracked,]

# making a merged version of all the raw data
raw_all <- full_join(raw_11, raw_12)
# write_csv(raw_all, "processed/dnajc6 behaviour raw_all.csv")

Genotype proportions

meta <- raw_all %>%
  select(fish_id, Genotype) %>%
  mutate(family = str_split_i(fish_id, "-", 2)) %>%
  unique()

table(meta$Genotype, meta$family)
##      
##       6.10 7.10
##   wt    12   21
##   het   32   30
##   hom   14   28

Summary

Here I will be generating the summary statistics for the raw data.

raw_sum <- do.call(cbind, lapply(raw_all[,-c(1:7)], summary)) %>% t()

raw_sum %>%
  kable(caption = "Raw data summary statistics") %>%
  kable_styling()
Raw data summary statistics
Min. 1st Qu. Median Mean 3rd Qu. Max.
Dist_travelled_total 0.00 4.7558700 4.34408e+01 7.520819e+01 126.0925000 4.20334e+03
Dist_travelled_mean 0.00 0.0031763 2.89652e-02 5.017920e-02 0.0841208 2.80972e+00
Dist_travelled_var 0.00 0.0008894 1.41353e-02 3.385730e-02 0.0428388 1.75365e+01
Velocity_mean 0.00 0.0794066 7.24130e-01 1.254481e+00 2.1030200 7.02430e+01
Velocity_var 0.00 0.5558590 8.83455e+00 2.116079e+01 26.7742000 1.09603e+04
Time_moving_total 0.00 0.0000000 8.24000e+00 1.572139e+01 29.9200000 5.95200e+01
Freq_moving 0.00 0.0000000 1.00000e+01 1.531855e+01 30.0000000 5.90000e+01
Acceleration_max 0.00 225.3200000 6.53241e+02 7.948633e+02 1095.0650000 8.96511e+03
Acceleration_min -8923.78 -807.9240000 -5.01932e+02 -6.381963e+02 -219.3220000 0.00000e+00
Acceleration_var 0.00 564.1380000 5.99109e+03 1.301328e+04 17170.8500000 4.69717e+06
Freq_in_middle 0.00 0.0000000 1.00000e+00 2.250524e+00 4.0000000 1.10000e+02
Time_in_middle_total 0.00 0.0000000 2.48000e+00 9.203332e+00 12.7200000 6.00000e+01
Time_in_middle_mean 0.00 0.0000000 9.92000e-01 4.520307e+00 2.9228550 6.00000e+01
Time_in_middle_var 0.00 0.0000000 0.00000e+00 1.116379e+13 1.5980550 8.88178e+16
Dist_to_middle_total 0.00 2028.7250000 3.31331e+03 3.106325e+03 4294.3850000 6.47753e+03
Dist_to_middle_mean 0.00 1.3534700 2.21022e+00 2.072536e+00 2.8648850 4.31835e+00
Dist_to_middle_var 0.00 0.0081318 4.75711e-01 6.702820e-01 1.2427250 3.99302e+00
Mobility_total 0.00 276.8230000 2.85153e+03 5.044701e+03 8716.9950000 6.71810e+04
Mobility_mean 0.00 0.1851945 1.90150e+00 3.365680e+00 5.8120150 4.47874e+01
Mobility_var 0.00 4.1786800 7.22169e+01 1.264605e+02 218.7490000 2.28910e+03
Meander_total 0.00 354.1540000 4.16458e+03 7.610795e+03 11838.9500000 1.47987e+05
Meander_mean 0.00 29.2001500 6.91981e+01 1.477224e+02 142.6790000 8.95194e+02
Meander_var 0.00 113.7510000 1.18357e+04 2.392162e+04 34660.8500000 2.89999e+05
Heading_mean 0.00 38.5336500 8.54421e+01 8.607118e+01 134.0180000 1.80000e+02
Heading_var 0.00 2566.5300000 2.99861e+03 2.611266e+03 3140.6950000 3.28281e+03

Outliers

Extreme outliers will result in problems with training the network, so I will be removing larvae with z-scores (absolute number of standard deviations from the mean) greater than 4 (for their genotype). To prevent the loss of too many data points, any values considered to be outliers will be converted to NA here and the row (fish) will only be omitted just before use of the data.

I’m also seeing which fish would be considered outliers if I were to calculate z-scores based on the overall mean rather than the mean for each genotype.

# this calculates the z-scores for each feature
z_scores <- function(data) {
  x <- sapply(data[,-c(1:7)],
         function(x) (abs(x - mean(x)) / sd(x))) %>%
    as.matrix()
}

threshold <- 4

trm_11 <- raw_11
outliers_11 <- c()
for (i in levels(trm_11$Genotype)) {
  # print(i)
  # print(summary(z_scores(trm_11[trm_11$Genotype == i,])))
  
  outliers_11[i] <- sum(rowSums(z_scores(trm_11[trm_11$Genotype == i,]) > threshold) > 0)
  
  trm_11[trm_11$Genotype == i,] <- trm_11[trm_11$Genotype == i,] %>%
    mutate(replace(.[,-c(1:7)], z_scores(.) > threshold, NA))
}
# outliers_11

trm_12 <- raw_12
outliers_12 <- c()
for (i in levels(trm_12$Genotype)) {
  # print(i)
  # print(summary(z_scores(trm_12[trm_12$Genotype == i,])))
  
  outliers_12[i] <- sum(rowSums(z_scores(trm_12[trm_12$Genotype == i,]) > threshold) > 0)
  
  trm_12[trm_12$Genotype == i,] <- trm_12[trm_12$Genotype == i,] %>%
    mutate(replace(.[,-c(1:7)], z_scores(.) > threshold, NA))
}
# outliers_12

trm_all <- full_join(trm_11, trm_12)

# write_csv(trm_all, "processed/dnajc6 behaviour trm_all.csv")

###
# Outlier removal using the overall mean

trm_11b <- raw_11 %>%
  mutate(replace(.[,-c(1:7)], z_scores(.) > threshold, NA))

trm_12b <- raw_12 %>%
  mutate(replace(.[,-c(1:7)], z_scores(.) > threshold, NA))

trm_all2 <- full_join(trm_11b, trm_12b)

Summary

outliers_sum <- rbind(outliers_11, outliers_12) %>%
  `rownames<-`(c("11 Oct", "12 Oct")) %>%
  cbind(Total = rowSums(.)) %>%
  rbind(Total = colSums(.))

outliers_sum %>% 
  kable(caption = "Number of outliers removed from each family") %>%
  kable_styling()
Number of outliers removed from each family
wt het hom Total
11 Oct 56 217 64 337
12 Oct 85 134 117 336
Total 141 351 181 673

Here, I’m going to make a table containing all outlier fish so I can have a better look at them.

outliers <- anti_join(raw_all, trm_all) %>% droplevels()

# number of unique fish IDs
length(levels(outliers$fish_id))
## [1] 131
df <- data.frame(table(outliers$fish_id)) %>%
  `colnames<-`(c("fish_id", "Freq")) %>%
  # adding genotypes
  left_join(outliers %>% select(fish_id, Genotype) %>% distinct()) %>%
  mutate(Family = str_extract(fish_id, "[^-]+$") %>% as.factor())

# df$Freq %>% summary()

x <- ggplot(df, aes(x = Freq)) +
  geom_dotplot(aes(fill = Genotype, colour = Genotype), dotsize = 0.75) +
  ggtitle("Group means")

###

outliers2 <- anti_join(raw_all, trm_all2) %>% droplevels()

# number of unique fish IDs
length(levels(outliers2$fish_id)) # less outliers
## [1] 129
df2 <- data.frame(table(outliers2$fish_id)) %>%
  `colnames<-`(c("fish_id", "Freq")) %>%
  # adding genotypes
  left_join(outliers2 %>% select(fish_id, Genotype) %>% distinct()) %>%
  mutate(Family = str_extract(fish_id, "[^-]+$") %>% as.factor())

# df2$Freq %>% summary()

y <- ggplot(df2, aes(x = Freq)) +
  geom_dotplot(aes(fill = Genotype, colour = Genotype), dotsize = 0.75) +
  ggtitle("Overall mean")

ggarrange(x, y, common.legend = T)

There are 131 fish with at least one extreme measurement, which is over half (59.6%) of the total fish tested. Most fish were outliers for ~5 of their time bins. The majority of fish that were outliers for only some time bins (<6) were homozygous, and those with extreme values for the majority of their bins (>6) were heterozygous.

Dotplot of which time bin each extreme value is from:

ggarrange(
  ggplot(outliers, aes(x = Bin)) +
    geom_dotplot(aes(fill = Genotype, colour = Genotype), 
                 dotsize = 0.5, stackgroups = T) +
    ggtitle("Group means"),
  
  ggplot(outliers2, aes(x = Bin)) +
    geom_dotplot(aes(fill = Genotype, colour = Genotype), 
                 dotsize = 0.5, stackgroups = T) +
    ggtitle("Overall mean"),
  common.legend = T
)

This shows that the vast majority of the outliers are in the light time periods. I likely need to look into ways to transform the data rather than removing outliers.

# features <- colnames(raw_all)[-c(1:7)]
# 
# for (i in features) {
#   boxplot(raw_all[[i]] ~ raw_all$Bin,
#           main = i)
# }

Processing

Light data

Here I will select locomotor features, subset the light bins of interest, then calculate the average value for each feature in the light. Rather than omitting the outliers, I just averaged their results (so some fish may have the average of less than 3 time bins).

light_bins <- c(3, 4, 5)

light_data <- trm_all %>% #select(all_of(locomotor_feat)) %>%
  subset(.$Bin %in% light_bins) %>% droplevels() %>% na.omit()

mean_light <- light_data %>%
  group_by(fish_id, Trial, Genotype) %>%
  summarise_if(is.numeric, mean) %>% ungroup()

Dark data

I’m also going to select all features in the dark for time bins 5, 6, and 7. These will be averaged for each fish as before.

dark_bins <- c(6, 7, 8)

dark_data <- trm_all %>%
  subset(.$Bin %in% dark_bins) %>% droplevels() %>% na.omit()

mean_dark <- dark_data %>%
  group_by(fish_id, Trial, Genotype) %>%
  summarise_if(is.numeric, mean) %>% ungroup()

Merging data

I will now merge the light and dark features for each fish, with separate columns for their performance in each condition.

mrclean <- full_join(mean_light, mean_dark,
                  by = c("fish_id", "Trial", "Genotype"),
                  suffix = c("", ".dark")) # no suffix for light features

# adding an extra column so CF can split the data by fish family
mrclean <- mrclean %>%
  mutate(Family = str_extract(fish_id, "[^-]+$") %>% as.factor()) %>%
  relocate(Family, .after = fish_id)

# saveRDS(mrclean, "processed/mrclean.rds")
# write_csv(mrclean, "processed/mrclean.csv")

Visualisation

Over time

source(file.path(base_dir, "scripts/graphs.R"))

features <- colnames(raw_all)[-c(1:7)]

plots_11 <- lapply(features, plot_raw, data = trm_11[!trm_11$Genotype == "het",] %>% na.omit())
plots_12 <- lapply(features, plot_raw, data = trm_12[!trm_12$Genotype == "het",] %>% na.omit())

plots_compare <- c(rbind(plots_11, plots_12))

ggarrange(plotlist = plots_compare, ncol = 2, common.legend = T)
## $`1`

## 
## $`2`

## 
## $`3`

## 
## $`4`

## 
## $`5`

## 
## $`6`

## 
## $`7`

## 
## $`8`

## 
## $`9`

## 
## $`10`

## 
## $`11`

## 
## $`12`

## 
## $`13`

## 
## $`14`

## 
## $`15`

## 
## $`16`

## 
## $`17`

## 
## $`18`

## 
## $`19`

## 
## $`20`

## 
## $`21`

## 
## $`22`

## 
## $`23`

## 
## $`24`

## 
## $`25`

## 
## attr(,"class")
## [1] "list"      "ggarrange"

Group means

features <- colnames(mrclean)[-c(1:4)]

plots_all <- lapply(features, plot_data2, data = mrclean[!mrclean$Genotype == "het",] %>% na.omit())

ggarrange(plotlist = plots_all, ncol = 2, common.legend = T)
## $`1`

## 
## $`2`

## 
## $`3`

## 
## $`4`

## 
## $`5`

## 
## $`6`

## 
## $`7`

## 
## $`8`

## 
## $`9`

## 
## $`10`

## 
## $`11`

## 
## $`12`

## 
## $`13`

## 
## $`14`

## 
## $`15`

## 
## $`16`

## 
## $`17`

## 
## $`18`

## 
## $`19`

## 
## $`20`

## 
## $`21`

## 
## $`22`

## 
## $`23`

## 
## $`24`

## 
## $`25`

## 
## attr(,"class")
## [1] "list"      "ggarrange"
# data <- trm_all[!trm_all$Genotype == "het",] %>% subset(Bin %in% c(2:7))
# 
# ggplot(data, aes(x = Genotype, y = Dist_travelled_total, fill = Bin)) +
#     geom_jitter(aes(colour = Genotype), size = 2, alpha = 0.5) +
#     geom_boxplot(aes(colour = Genotype), alpha = 0.25, outlier.shape = NA)
source(file.path(base_dir, "scripts/graphs.R"))
data <- trm_all[!trm_all$Genotype == "het",] %>% droplevels() %>% na.omit()

levels(data$Genotype) <- c("Wildtype", "Mutant")

# a <- plot_raw(data, "Dist_travelled_mean")
# b <- plot_raw(data, "Freq_moving")

panels <- data.frame(start = c(0.5, 5.5, 10.5),
                     end = c(5.5, 10.5, 15.5),
                     colours = factor(c("#FFE961", "#000000", "#FFE961")))

# INTERESTING PLOT
#
# a <- ggplot() +
#     # geom_jitter(data = data, aes(x = Bin, y = Dist_travelled_total, colour = Genotype), alpha = 0.25, size = 2) +
#     # theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
#     # Line showing the mean for each genotype in each time bin
#     geom_line(
#       data = data, #%>%
#         # group_by(Bin, Genotype) %>%
#         # summarise(g_mean = mean(Dist_travelled_total), .groups = "drop"),
#       aes(x = Bin, y = Dist_travelled_total, group = Genotype, colour = Genotype), size = 1.5) +
#     # Panels showing light/dark periods
#     geom_rect(
#       data = panels,
#       aes(xmin = start, xmax = end, ymin = -Inf, ymax = Inf),
#       fill = panels$colours, alpha = 0.3) +
#     xlab("Time bin (minute)") +
#     ylab("Total distance travelled (mm)") +
#   theme(text = element_text(size=20))

# b <- ggplot() +
#     geom_jitter(data = data, aes(x = Bin, y = Dist_travelled_total, colour = Genotype), alpha = 0.25) +
#     # theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
#     # Line showing the mean for each genotype in each time bin
#   # scale_colour_gradient() +  
#   geom_line(
#       data = data %>%
#         group_by(Trial) %>%
#         # summarise(t_mean = mean(Dist_travelled_total), .groups = "drop") %>%
#         mutate(t_mean = mean(Dist_travelled_total),
#                geno_prop = prop.table(table(Genotype))[1]),
#       aes(x = Bin, y = t_mean, group = Trial, colour = geno_prop), size = 1) +
#     # Panels showing light/dark periods
#     geom_rect(
#       data = panels,
#       aes(xmin = start, xmax = end, ymin = -Inf, ymax = Inf),
#       fill = panels$colours, alpha = 0.3) +
#     xlab("Time bin (minute)") +
#     ylab("Frequency moving")

# fig <- ggarrange(a, b, common.legend = T)

# ggsave("raw_data.png", plot = a)